set.seed(1)
project_2_data =
read_csv("project_2_data.csv", na = c("NA", "", ".")) |>
janitor::clean_names() |>
mutate(status = ifelse(status == "Dead", 1, 0))
## Rows: 4024 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Race, Marital Status, T Stage, N Stage, 6th Stage, differentiate, ...
## dbl (5): Age, Tumor Size, Regional Node Examined, Reginol Node Positive, Su...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dead <- project_2_data |>
filter(status == 1)
alive <- project_2_data |>
filter(status == 0) |>
sample_n(nrow(dead))
project_2_numdata<-
bind_rows(dead, alive) |>
mutate(
t_stage = case_when(
t_stage == "T1" ~ 1,
t_stage == "T2" ~ 2,
t_stage == "T3" ~ 3,
t_stage == "T4" ~ 4,
TRUE ~ NA_real_),
n_stage = case_when(
n_stage == "N1" ~ 1,
n_stage == "N2" ~ 2,
n_stage == "N3" ~ 3,
TRUE ~ NA_real_),
differentiate = case_when(
differentiate == "Well differentiated" ~ 1,
differentiate == "Moderately differentiated" ~ 2,
differentiate == "Poorly differentiated" ~ 3,
differentiate == "Undifferentiated" ~ 4,
TRUE ~ NA_real_),
grade = case_when(
grade == "anaplastic; Grade IV" ~ 4,
grade == "3" ~ 3,
grade == "2" ~ 2,
grade == "1" ~ 1,
TRUE ~ NA_real_),
a_stage_regional = ifelse(a_stage == "Regional", 1, 0),
estrogen_status = ifelse(estrogen_status == "Positive", 1, 0),
progesterone_status = ifelse(progesterone_status == "Positive", 1, 0)
) |>
select(-a_stage)
variables <- names(project_2_numdata)[!names(project_2_numdata) %in% c("survival_months", "status")]
for (var in variables) {
formula <- as.formula(paste("Surv(survival_months, status) ~", var))
model <- coxph(formula, data = project_2_numdata)
print(var)
print(summary(model))
}
## [1] "age"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.013313 1.013402 0.004454 2.989 0.0028 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.013 0.9868 1.005 1.022
##
## Concordance= 0.536 (se = 0.012 )
## Likelihood ratio test= 9.05 on 1 df, p=0.003
## Wald test = 8.93 on 1 df, p=0.003
## Score (logrank) test = 8.95 on 1 df, p=0.003
##
## [1] "race"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## raceOther -0.5965 0.5508 0.2099 -2.842 0.00448 **
## raceWhite -0.3263 0.7216 0.1252 -2.607 0.00914 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## raceOther 0.5508 1.816 0.3650 0.8310
## raceWhite 0.7216 1.386 0.5646 0.9222
##
## Concordance= 0.527 (se = 0.008 )
## Likelihood ratio test= 9.46 on 2 df, p=0.009
## Wald test = 9.81 on 2 df, p=0.007
## Score (logrank) test = 9.92 on 2 df, p=0.007
##
## [1] "marital_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## marital_statusMarried -0.23876 0.78761 0.11794 -2.024 0.0429 *
## marital_statusSeparated 0.53873 1.71383 0.27908 1.930 0.0536 .
## marital_statusSingle -0.10841 0.89726 0.14404 -0.753 0.4517
## marital_statusWidowed 0.04877 1.04998 0.17757 0.275 0.7836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## marital_statusMarried 0.7876 1.2697 0.6251 0.9924
## marital_statusSeparated 1.7138 0.5835 0.9918 2.9616
## marital_statusSingle 0.8973 1.1145 0.6766 1.1899
## marital_statusWidowed 1.0500 0.9524 0.7414 1.4871
##
## Concordance= 0.537 (se = 0.011 )
## Likelihood ratio test= 12.55 on 4 df, p=0.01
## Wald test = 14 on 4 df, p=0.007
## Score (logrank) test = 14.36 on 4 df, p=0.006
##
## [1] "t_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## t_stage 0.38440 1.46874 0.04713 8.156 3.48e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## t_stage 1.469 0.6809 1.339 1.611
##
## Concordance= 0.582 (se = 0.011 )
## Likelihood ratio test= 62.94 on 1 df, p=2e-15
## Wald test = 66.51 on 1 df, p=3e-16
## Score (logrank) test = 67.36 on 1 df, p=2e-16
##
## [1] "n_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## n_stage 0.61870 1.85652 0.04835 12.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## n_stage 1.857 0.5386 1.689 2.041
##
## Concordance= 0.623 (se = 0.011 )
## Likelihood ratio test= 152 on 1 df, p=<2e-16
## Wald test = 163.7 on 1 df, p=<2e-16
## Score (logrank) test = 175.6 on 1 df, p=<2e-16
##
## [1] "x6th_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## x6th_stageIIB 0.4390 1.5512 0.1336 3.287 0.00101 **
## x6th_stageIIIA 0.7669 2.1530 0.1260 6.088 1.15e-09 ***
## x6th_stageIIIB 1.4792 4.3893 0.2464 6.003 1.94e-09 ***
## x6th_stageIIIC 1.5199 4.5718 0.1274 11.934 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## x6th_stageIIB 1.551 0.6447 1.194 2.015
## x6th_stageIIIA 2.153 0.4645 1.682 2.756
## x6th_stageIIIB 4.389 0.2278 2.708 7.115
## x6th_stageIIIC 4.572 0.2187 3.562 5.868
##
## Concordance= 0.637 (se = 0.012 )
## Likelihood ratio test= 171.5 on 4 df, p=<2e-16
## Wald test = 177.7 on 4 df, p=<2e-16
## Score (logrank) test = 198.8 on 4 df, p=<2e-16
##
## [1] "differentiate"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## differentiate 0.44124 1.55463 0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## differentiate 1.555 0.6432 1.37 1.764
##
## Concordance= 0.574 (se = 0.011 )
## Likelihood ratio test= 48.05 on 1 df, p=4e-12
## Wald test = 47.04 on 1 df, p=7e-12
## Score (logrank) test = 47.37 on 1 df, p=6e-12
##
## [1] "grade"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## grade 0.44124 1.55463 0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## grade 1.555 0.6432 1.37 1.764
##
## Concordance= 0.574 (se = 0.011 )
## Likelihood ratio test= 48.05 on 1 df, p=4e-12
## Wald test = 47.04 on 1 df, p=7e-12
## Score (logrank) test = 47.37 on 1 df, p=6e-12
##
## [1] "tumor_size"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## tumor_size 0.011933 1.012005 0.001574 7.58 3.46e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## tumor_size 1.012 0.9881 1.009 1.015
##
## Concordance= 0.592 (se = 0.012 )
## Likelihood ratio test= 50.4 on 1 df, p=1e-12
## Wald test = 57.45 on 1 df, p=3e-14
## Score (logrank) test = 58.25 on 1 df, p=2e-14
##
## [1] "estrogen_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## estrogen_status -0.9108 0.4022 0.1064 -8.562 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## estrogen_status 0.4022 2.486 0.3265 0.4954
##
## Concordance= 0.565 (se = 0.008 )
## Likelihood ratio test= 60.02 on 1 df, p=9e-15
## Wald test = 73.31 on 1 df, p=<2e-16
## Score (logrank) test = 78.48 on 1 df, p=<2e-16
##
## [1] "progesterone_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## progesterone_status -0.71314 0.49010 0.08583 -8.308 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## progesterone_status 0.4901 2.04 0.4142 0.5799
##
## Concordance= 0.588 (se = 0.01 )
## Likelihood ratio test= 62.82 on 1 df, p=2e-15
## Wald test = 69.03 on 1 df, p=<2e-16
## Score (logrank) test = 71.97 on 1 df, p=<2e-16
##
## [1] "regional_node_examined"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## regional_node_examined 0.010506 1.010561 0.004982 2.109 0.035 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## regional_node_examined 1.011 0.9895 1.001 1.02
##
## Concordance= 0.521 (se = 0.013 )
## Likelihood ratio test= 4.35 on 1 df, p=0.04
## Wald test = 4.45 on 1 df, p=0.03
## Score (logrank) test = 4.44 on 1 df, p=0.04
##
## [1] "reginol_node_positive"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## reginol_node_positive 0.054926 1.056462 0.004627 11.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## reginol_node_positive 1.056 0.9466 1.047 1.066
##
## Concordance= 0.627 (se = 0.012 )
## Likelihood ratio test= 106.3 on 1 df, p=<2e-16
## Wald test = 140.9 on 1 df, p=<2e-16
## Score (logrank) test = 148 on 1 df, p=<2e-16
##
## [1] "a_stage_regional"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## a_stage_regional -1.1046 0.3314 0.1747 -6.324 2.55e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## a_stage_regional 0.3314 3.018 0.2353 0.4666
##
## Concordance= 0.522 (se = 0.005 )
## Likelihood ratio test= 29.49 on 1 df, p=6e-08
## Wald test = 39.99 on 1 df, p=3e-10
## Score (logrank) test = 44.22 on 1 df, p=3e-11
All variables are statistically significant except for marriage in “marital_status”
project_2_numdata <-
project_2_numdata|>
mutate(Married = ifelse(marital_status == "Married", 1, 0)) |>
select(-differentiate, - marital_status)
cor_matrix <- cor(project_2_numdata[, sapply(project_2_numdata, is.numeric)])
print(cor_matrix)
## age t_stage n_stage grade
## age 1.00000000 -0.050113404 -0.01706295 -0.12321448
## t_stage -0.05011340 1.000000000 0.30612098 0.15076838
## n_stage -0.01706295 0.306120984 1.00000000 0.17991601
## grade -0.12321448 0.150768380 0.17991601 1.00000000
## tumor_size -0.07953275 0.784997546 0.30277440 0.14403058
## estrogen_status 0.12111658 -0.106722673 -0.12700846 -0.26247316
## progesterone_status 0.01233580 -0.071974613 -0.11108594 -0.21201525
## regional_node_examined -0.03141638 0.088442919 0.36808270 0.07651544
## reginol_node_positive 0.01628318 0.244266968 0.82874594 0.15163577
## survival_months -0.08599693 -0.159716020 -0.24737291 -0.15052637
## status 0.06976636 0.225250913 0.34556514 0.19151006
## a_stage_regional 0.04777171 -0.234675561 -0.25575034 -0.04874199
## Married -0.04796418 0.003435462 -0.05151826 -0.02717180
## tumor_size estrogen_status progesterone_status
## age -0.07953275 0.121116575 0.01233580
## t_stage 0.78499755 -0.106722673 -0.07197461
## n_stage 0.30277440 -0.127008462 -0.11108594
## grade 0.14403058 -0.262473159 -0.21201525
## tumor_size 1.00000000 -0.123020668 -0.08438705
## estrogen_status -0.12302067 1.000000000 0.58504596
## progesterone_status -0.08438705 0.585045962 1.00000000
## regional_node_examined 0.11473470 -0.041404301 -0.01390137
## reginol_node_positive 0.24878553 -0.100873585 -0.07105526
## survival_months -0.16151178 0.227274035 0.21200308
## status 0.19535827 -0.175795073 -0.20001424
## a_stage_regional -0.12796649 0.127990099 0.07311096
## Married -0.01036450 0.008579762 -0.01351800
## regional_node_examined reginol_node_positive
## age -0.03141638 0.01628318
## t_stage 0.08844292 0.24426697
## n_stage 0.36808270 0.82874594
## grade 0.07651544 0.15163577
## tumor_size 0.11473470 0.24878553
## estrogen_status -0.04140430 -0.10087358
## progesterone_status -0.01390137 -0.07105526
## regional_node_examined 1.00000000 0.49395643
## reginol_node_positive 0.49395643 1.00000000
## survival_months -0.03087185 -0.21263747
## status 0.06199862 0.31134369
## a_stage_regional -0.04237809 -0.19505294
## Married 0.02581466 -0.03581806
## survival_months status a_stage_regional
## age -0.08599693 0.06976636 0.04777171
## t_stage -0.15971602 0.22525091 -0.23467556
## n_stage -0.24737291 0.34556514 -0.25575034
## grade -0.15052637 0.19151006 -0.04874199
## tumor_size -0.16151178 0.19535827 -0.12796649
## estrogen_status 0.22727403 -0.17579507 0.12799010
## progesterone_status 0.21200308 -0.20001424 0.07311096
## regional_node_examined -0.03087185 0.06199862 -0.04237809
## reginol_node_positive -0.21263747 0.31134369 -0.19505294
## survival_months 1.00000000 -0.58302039 0.13738966
## status -0.58302039 1.00000000 -0.13123516
## a_stage_regional 0.13738966 -0.13123516 1.00000000
## Married 0.06161119 -0.08197981 0.03226076
## Married
## age -0.047964181
## t_stage 0.003435462
## n_stage -0.051518259
## grade -0.027171798
## tumor_size -0.010364496
## estrogen_status 0.008579762
## progesterone_status -0.013518004
## regional_node_examined 0.025814656
## reginol_node_positive -0.035818059
## survival_months 0.061611186
## status -0.081979813
## a_stage_regional 0.032260758
## Married 1.000000000
cortable<-project_2_numdata|>
select(-race,-x6th_stage)
cortable$regional_node_examined
## [1] 9 9 12 15 16 23 24 8 5 21 4 3 33 7 11 3 16 8 28 20 14 23 20 13
## [25] 15 14 20 17 11 23 3 34 1 17 17 4 12 27 14 20 28 29 18 3 4 13 23 18
## [49] 7 41 11 10 27 19 5 24 2 12 16 15 40 9 10 19 17 3 19 16 7 25 1 9
## [73] 10 15 9 11 29 21 17 17 7 15 22 47 22 14 9 9 13 14 10 19 15 6 2 54
## [97] 5 31 28 21 19 8 13 13 9 18 12 15 27 24 5 7 22 17 16 12 9 23 10 6
## [121] 19 5 15 10 30 18 15 27 14 3 36 16 5 9 3 32 31 21 18 10 3 20 10 25
## [145] 9 14 9 29 32 19 27 1 11 13 14 21 11 15 16 22 2 14 17 5 10 23 10 3
## [169] 17 18 18 23 9 9 8 23 19 12 9 26 12 15 23 9 14 22 16 30 4 2 10 13
## [193] 1 10 4 15 7 5 28 12 18 28 18 12 6 18 26 3 21 36 26 18 18 28 16 30
## [217] 2 1 21 14 21 4 10 16 19 15 29 15 34 23 27 17 13 12 3 13 14 36 11 12
## [241] 9 9 5 13 11 20 10 14 16 47 15 16 29 16 9 7 11 6 16 18 22 18 4 25
## [265] 29 1 13 13 9 21 25 10 2 8 21 2 26 17 4 9 13 16 10 13 1 6 9 9
## [289] 18 22 8 15 8 4 4 18 12 17 15 19 16 27 15 10 14 13 12 11 8 14 21 13
## [313] 17 4 19 12 17 7 25 37 8 24 13 22 15 3 13 3 2 19 12 11 11 2 20 12
## [337] 19 11 15 12 21 10 10 17 3 15 17 15 17 11 8 18 21 18 10 29 9 18 13 20
## [361] 35 23 12 7 32 18 18 5 28 10 10 18 11 9 9 1 30 3 7 9 19 57 5 22
## [385] 12 15 12 3 8 15 25 10 4 26 25 16 23 13 17 8 24 9 10 9 36 19 12 8
## [409] 21 8 10 30 17 11 12 20 22 14 4 11 1 4 18 20 12 19 15 21 16 18 11 16
## [433] 17 5 16 23 7 5 19 9 22 22 8 10 19 11 11 7 12 8 13 12 24 12 16 16
## [457] 20 13 3 30 22 35 11 10 7 30 29 14 12 17 2 24 15 10 9 12 4 19 23 28
## [481] 17 14 27 12 16 11 19 22 16 8 9 10 20 15 4 24 19 18 12 4 9 17 22 2
## [505] 11 6 2 16 21 20 12 17 21 7 15 13 6 4 9 30 2 29 13 3 8 5 14 6
## [529] 15 1 14 11 21 34 11 18 13 7 11 11 4 16 19 20 12 13 30 19 17 12 12 15
## [553] 12 9 12 6 8 2 12 28 14 24 24 4 26 7 18 8 18 13 22 9 28 21 10 13
## [577] 6 16 12 2 15 16 17 17 23 25 47 21 19 18 18 3 31 11 2 16 14 16 23 13
## [601] 14 7 10 7 16 25 28 10 16 22 23 21 21 19 6 2 8 6 12 2 5 14 15 8
## [625] 3 23 19 19 15 7 7 14 10 10 14 31 17 25 7 12 15 12 4 17 19 24 14 24
## [649] 6 25 6 19 34 16 3 7 15 19 2 26 9 21 2 9 13 10 17 13 4 15 5 13
## [673] 21 5 18 6 21 11 18 11 7 4 5 5 13 18 17 5 7 14 23 11 15 13 13 10
## [697] 43 47 20 14 9 26 7 12 28 1 5 18 18 12 11 20 41 23 10 20 16 21 26 12
## [721] 6 17 12 11 20 22 26 10 8 15 11 4 16 13 21 14 17 9 10 8 3 12 22 37
## [745] 18 7 11 23 13 25 12 17 29 1 12 7 1 23 4 10 25 13 1 9 9 11 26 9
## [769] 18 3 8 24 9 14 24 18 2 15 24 9 23 27 16 15 18 16 17 25 15 15 18 15
## [793] 12 2 13 20 9 13 16 15 15 9 23 13 2 20 19 29 22 2 15 9 23 7 12 3
## [817] 13 1 31 15 20 12 18 10 15 7 23 24 13 8 12 13 20 7 22 4 11 14 16 17
## [841] 10 13 11 19 17 15 25 15 19 9 20 16 14 6 14 11 13 16 7 17 16 13 5 35
## [865] 1 20 20 5 5 26 15 15 16 20 13 14 7 15 27 22 30 4 34 1 20 14 18 10
## [889] 16 8 29 4 9 8 3 10 25 18 20 9 17 28 15 20 3 18 9 21 14 7 1 12
## [913] 15 6 17 35 13 25 18 8 16 26 5 8 9 25 27 13 3 16 29 6 14 3 17 16
## [937] 17 32 8 16 3 2 16 20 24 13 19 19 11 3 14 12 8 6 1 23 7 2 4 13
## [961] 12 6 37 15 17 2 14 11 3 12 27 6 19 4 19 12 14 18 15 12 20 10 21 8
## [985] 17 17 8 20 15 10 6 21 6 8 17 24 7 13 14 27 15 23 15 11 1 24 13 26
## [1009] 16 18 5 16 28 19 12 14 14 23 17 18 31 12 11 9 20 16 18 18 2 13 4 14
## [1033] 12 13 4 3 2 8 11 3 2 2 2 13 10 1 16 12 18 5 28 16 13 15 7 22
## [1057] 17 11 10 16 14 9 8 16 15 6 18 14 15 14 3 22 38 3 5 17 9 23 14 13
## [1081] 9 24 9 9 6 16 18 17 10 24 8 27 7 22 10 12 28 14 9 7 17 26 11 23
## [1105] 9 11 12 32 32 6 13 16 8 19 37 3 18 14 10 21 13 12 15 9 6 7 9 13
## [1129] 31 13 18 4 6 18 12 3 16 11 3 29 26 14 9 8 29 18 27 11 20 19 9 16
## [1153] 13 3 9 2 12 10 7 12 25 26 18 14 17 18 9 9 12 13 14 18 30 14 6 15
## [1177] 2 19 16 14 9 10 9 5 7 11 4 14 10 4 9 12 12 3 16 16 17 24 9 11
## [1201] 16 19 11 14 3 4 5 5 17 12 16 22 11 10 8 18 21 10 4 25 3 19 14 16
## [1225] 15 16 16 11 18 13 11 12
chart.Correlation(cortable, histogram=TRUE, pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
We have to choice 1 between t_stage - tumor size n stage -
regional_node_positive
muldata = project_2_numdata
cox_model <- coxph(Surv(survival_months, status) ~.,
data = muldata)
vifs <- vif(cox_model)
## Warning in vif.default(cox_model): No intercept: vifs may not be sensible.
print(vifs)
## GVIF Df GVIF^(1/(2*Df))
## age 1.078511 1 1.038514
## race 1.089620 2 1.021689
## t_stage 4.204881 1 2.050581
## n_stage 16.052408 1 4.006546
## x6th_stage 42.723885 4 1.598946
## grade 1.132422 1 1.064153
## tumor_size 2.891806 1 1.700531
## estrogen_status 1.705080 1 1.305787
## progesterone_status 1.579316 1 1.256708
## regional_node_examined 1.766667 1 1.329160
## reginol_node_positive 4.281220 1 2.069111
## a_stage_regional 1.337110 1 1.156335
## Married 1.094899 1 1.046374
remove t-stage,n_stage and regional_mode_positive due to higher VIF score
final = muldata |>
select(-t_stage, -n_stage,-reginol_node_positive)
cox_model1 <- coxph(Surv(survival_months, status) ~.,
data = final)
vifs <- vif(cox_model1)
## Warning in vif.default(cox_model1): No intercept: vifs may not be sensible.
print(vifs)
## GVIF Df GVIF^(1/(2*Df))
## age 1.065375 1 1.032170
## race 1.084396 2 1.020462
## x6th_stage 2.150931 4 1.100470
## grade 1.123660 1 1.060028
## tumor_size 1.298750 1 1.139627
## estrogen_status 1.647493 1 1.283547
## progesterone_status 1.541107 1 1.241413
## regional_node_examined 1.247171 1 1.116768
## a_stage_regional 1.327119 1 1.152007
## Married 1.093934 1 1.045913
summary(cox_model)
## Call:
## coxph(formula = Surv(survival_months, status) ~ ., data = muldata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.019327 1.019515 0.004705 4.108 3.99e-05 ***
## raceOther -0.435919 0.646670 0.213742 -2.039 0.04140 *
## raceWhite -0.261756 0.769699 0.129732 -2.018 0.04363 *
## t_stage -0.027546 0.972830 0.096358 -0.286 0.77498
## n_stage 0.453734 1.574180 0.193949 2.339 0.01931 *
## x6th_stageIIB 0.410796 1.508018 0.151806 2.706 0.00681 **
## x6th_stageIIIA 0.246022 1.278928 0.235577 1.044 0.29633
## x6th_stageIIIB 0.836403 2.308051 0.388854 2.151 0.03148 *
## x6th_stageIIIC 0.138145 1.148142 0.450956 0.306 0.75935
## grade 0.278566 1.321234 0.068832 4.047 5.19e-05 ***
## tumor_size 0.005146 1.005160 0.002796 1.841 0.06569 .
## estrogen_status -0.340392 0.711491 0.139382 -2.442 0.01460 *
## progesterone_status -0.449035 0.638244 0.108196 -4.150 3.32e-05 ***
## regional_node_examined -0.021173 0.979050 0.006570 -3.222 0.00127 **
## reginol_node_positive 0.027317 1.027694 0.011106 2.460 0.01391 *
## a_stage_regional -0.305088 0.737058 0.202176 -1.509 0.13129
## Married -0.114324 0.891969 0.085494 -1.337 0.18115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0195 0.9809 1.0102 1.0290
## raceOther 0.6467 1.5464 0.4253 0.9832
## raceWhite 0.7697 1.2992 0.5969 0.9925
## t_stage 0.9728 1.0279 0.8054 1.1751
## n_stage 1.5742 0.6353 1.0764 2.3022
## x6th_stageIIB 1.5080 0.6631 1.1199 2.0306
## x6th_stageIIIA 1.2789 0.7819 0.8060 2.0294
## x6th_stageIIIB 2.3081 0.4333 1.0771 4.9458
## x6th_stageIIIC 1.1481 0.8710 0.4744 2.7788
## grade 1.3212 0.7569 1.1545 1.5121
## tumor_size 1.0052 0.9949 0.9997 1.0107
## estrogen_status 0.7115 1.4055 0.5414 0.9350
## progesterone_status 0.6382 1.5668 0.5163 0.7890
## regional_node_examined 0.9790 1.0214 0.9665 0.9917
## reginol_node_positive 1.0277 0.9731 1.0056 1.0503
## a_stage_regional 0.7371 1.3567 0.4959 1.0955
## Married 0.8920 1.1211 0.7544 1.0547
##
## Concordance= 0.696 (se = 0.011 )
## Likelihood ratio test= 302.6 on 17 df, p=<2e-16
## Wald test = 323.8 on 17 df, p=<2e-16
## Score (logrank) test = 354.9 on 17 df, p=<2e-16
ph_test <- cox.zph(cox_model1)
print(ph_test)
## chisq df p
## age 0.1978 1 0.66
## race 0.9493 2 0.62
## x6th_stage 7.3617 4 0.12
## grade 0.1293 1 0.72
## tumor_size 0.0858 1 0.77
## estrogen_status 23.1261 1 1.5e-06
## progesterone_status 19.6293 1 9.4e-06
## regional_node_examined 0.0880 1 0.77
## a_stage_regional 0.6691 1 0.41
## Married 2.3184 1 0.13
## GLOBAL 47.1764 14 1.8e-05
plot(ph_test)
The Cox model assumes that hazard ratios are constant over time. A non-significant p-value (p > 0.05) indicates that the PH assumption holds. As GLOBAL 47.176 15 1.8e-05, the model did not meet the assumption, as same as estrogen_status and progesterone_status.We need further improve our model.
cox_model2 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade +
tumor_size + regional_node_examined + a_stage_regional+Married,
data = final,x = TRUE)
summary(cox_model2)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race +
## x6th_stage + grade + tumor_size + regional_node_examined +
## a_stage_regional + Married, data = final, x = TRUE)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.018335 1.018504 0.004590 3.994 6.49e-05 ***
## raceOther -0.464884 0.628208 0.213238 -2.180 0.02925 *
## raceWhite -0.328808 0.719781 0.129044 -2.548 0.01083 *
## x6th_stageIIB 0.408660 1.504799 0.137857 2.964 0.00303 **
## x6th_stageIIIA 0.716327 2.046901 0.139572 5.132 2.86e-07 ***
## x6th_stageIIIB 1.202920 3.329827 0.273778 4.394 1.11e-05 ***
## x6th_stageIIIC 1.451305 4.268680 0.155308 9.345 < 2e-16 ***
## grade 0.368143 1.445048 0.066173 5.563 2.65e-08 ***
## tumor_size 0.002535 1.002539 0.001888 1.343 0.17931
## regional_node_examined -0.014263 0.985838 0.005689 -2.507 0.01217 *
## a_stage_regional -0.332487 0.717138 0.196343 -1.693 0.09038 .
## Married -0.132085 0.876267 0.084540 -1.562 0.11819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0185 0.9818 1.0094 1.0277
## raceOther 0.6282 1.5918 0.4136 0.9541
## raceWhite 0.7198 1.3893 0.5589 0.9269
## x6th_stageIIB 1.5048 0.6645 1.1485 1.9716
## x6th_stageIIIA 2.0469 0.4885 1.5570 2.6909
## x6th_stageIIIB 3.3298 0.3003 1.9471 5.6946
## x6th_stageIIIC 4.2687 0.2343 3.1484 5.7875
## grade 1.4450 0.6920 1.2693 1.6452
## tumor_size 1.0025 0.9975 0.9988 1.0063
## regional_node_examined 0.9858 1.0144 0.9749 0.9969
## a_stage_regional 0.7171 1.3944 0.4881 1.0537
## Married 0.8763 1.1412 0.7425 1.0342
##
## Concordance= 0.67 (se = 0.012 )
## Likelihood ratio test= 239.8 on 12 df, p=<2e-16
## Wald test = 247.9 on 12 df, p=<2e-16
## Score (logrank) test = 271.8 on 12 df, p=<2e-16
ph_test <- cox.zph(cox_model2)
print(ph_test)
## chisq df p
## age 2.87e-01 1 0.592
## race 8.46e-01 2 0.655
## x6th_stage 5.72e+00 4 0.221
## grade 3.61e-01 1 0.548
## tumor_size 6.89e-04 1 0.979
## regional_node_examined 5.56e-03 1 0.941
## a_stage_regional 1.36e+00 1 0.244
## Married 2.90e+00 1 0.089
## GLOBAL 1.63e+01 12 0.178
plot(ph_test)
now the assumption is been fully achieved after delete estrogen_status
and progesterone_status
cox_model3 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade +
log2(tumor_size+1) + log2(regional_node_examined+1) +a_stage_regional + Married,
data = final)
summary(cox_model3)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race +
## x6th_stage + grade + log2(tumor_size + 1) + log2(regional_node_examined +
## 1) + a_stage_regional + Married, data = final)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.018234 1.018401 0.004601 3.963 7.41e-05
## raceOther -0.485708 0.615262 0.213037 -2.280 0.02261
## raceWhite -0.337849 0.713303 0.128868 -2.622 0.00875
## x6th_stageIIB 0.365175 1.440766 0.146221 2.497 0.01251
## x6th_stageIIIA 0.698826 2.011389 0.149210 4.684 2.82e-06
## x6th_stageIIIB 1.168003 3.215565 0.274916 4.249 2.15e-05
## x6th_stageIIIC 1.443291 4.234611 0.161875 8.916 < 2e-16
## grade 0.363796 1.438780 0.066076 5.506 3.68e-08
## log2(tumor_size + 1) 0.092885 1.097335 0.052389 1.773 0.07623
## log2(regional_node_examined + 1) -0.167005 0.846196 0.051160 -3.264 0.00110
## a_stage_regional -0.336402 0.714336 0.195087 -1.724 0.08464
## Married -0.128061 0.879800 0.084592 -1.514 0.13006
##
## age ***
## raceOther *
## raceWhite **
## x6th_stageIIB *
## x6th_stageIIIA ***
## x6th_stageIIIB ***
## x6th_stageIIIC ***
## grade ***
## log2(tumor_size + 1) .
## log2(regional_node_examined + 1) **
## a_stage_regional .
## Married
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0184 0.9819 1.0093 1.0276
## raceOther 0.6153 1.6253 0.4052 0.9341
## raceWhite 0.7133 1.4019 0.5541 0.9183
## x6th_stageIIB 1.4408 0.6941 1.0818 1.9189
## x6th_stageIIIA 2.0114 0.4972 1.5014 2.6947
## x6th_stageIIIB 3.2156 0.3110 1.8761 5.5115
## x6th_stageIIIC 4.2346 0.2361 3.0834 5.8157
## grade 1.4388 0.6950 1.2640 1.6377
## log2(tumor_size + 1) 1.0973 0.9113 0.9903 1.2160
## log2(regional_node_examined + 1) 0.8462 1.1818 0.7655 0.9354
## a_stage_regional 0.7143 1.3999 0.4874 1.0470
## Married 0.8798 1.1366 0.7454 1.0385
##
## Concordance= 0.672 (se = 0.012 )
## Likelihood ratio test= 245 on 12 df, p=<2e-16
## Wald test = 251.2 on 12 df, p=<2e-16
## Score (logrank) test = 275.3 on 12 df, p=<2e-16
Add log transformation to the tumor size and regional node_examined, now they are being statistically significant
cox_model4 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade +
log2(tumor_size+1) + log2(regional_node_examined+1) + +a_stage_regional+ Married + Married*log2(regional_node_examined+1),
data = final)
summary(cox_model4)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race +
## x6th_stage + grade + log2(tumor_size + 1) + log2(regional_node_examined +
## 1) + +a_stage_regional + Married + Married * log2(regional_node_examined +
## 1), data = final)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z
## age 0.018417 1.018588 0.004613 3.993
## raceOther -0.497477 0.608063 0.213151 -2.334
## raceWhite -0.348671 0.705625 0.129089 -2.701
## x6th_stageIIB 0.361952 1.436130 0.146214 2.476
## x6th_stageIIIA 0.698313 2.010357 0.149257 4.679
## x6th_stageIIIB 1.196914 3.309886 0.276906 4.322
## x6th_stageIIIC 1.446583 4.248571 0.161670 8.948
## grade 0.366175 1.442208 0.066117 5.538
## log2(tumor_size + 1) 0.092912 1.097365 0.052270 1.778
## log2(regional_node_examined + 1) -0.251965 0.777272 0.073308 -3.437
## a_stage_regional -0.333872 0.716146 0.195867 -1.705
## Married -0.676046 0.508624 0.356283 -1.897
## log2(regional_node_examined + 1):Married 0.146154 1.157375 0.092504 1.580
## Pr(>|z|)
## age 6.53e-05 ***
## raceOther 0.019600 *
## raceWhite 0.006913 **
## x6th_stageIIB 0.013305 *
## x6th_stageIIIA 2.89e-06 ***
## x6th_stageIIIB 1.54e-05 ***
## x6th_stageIIIC < 2e-16 ***
## grade 3.06e-08 ***
## log2(tumor_size + 1) 0.075478 .
## log2(regional_node_examined + 1) 0.000588 ***
## a_stage_regional 0.088272 .
## Married 0.057762 .
## log2(regional_node_examined + 1):Married 0.114113
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95
## age 1.0186 0.9818 1.0094
## raceOther 0.6081 1.6446 0.4004
## raceWhite 0.7056 1.4172 0.5479
## x6th_stageIIB 1.4361 0.6963 1.0783
## x6th_stageIIIA 2.0104 0.4974 1.5005
## x6th_stageIIIB 3.3099 0.3021 1.9236
## x6th_stageIIIC 4.2486 0.2354 3.0948
## grade 1.4422 0.6934 1.2669
## log2(tumor_size + 1) 1.0974 0.9113 0.9905
## log2(regional_node_examined + 1) 0.7773 1.2866 0.6732
## a_stage_regional 0.7161 1.3964 0.4878
## Married 0.5086 1.9661 0.2530
## log2(regional_node_examined + 1):Married 1.1574 0.8640 0.9655
## upper .95
## age 1.0278
## raceOther 0.9234
## raceWhite 0.9088
## x6th_stageIIB 1.9127
## x6th_stageIIIA 2.6935
## x6th_stageIIIB 5.6953
## x6th_stageIIIC 5.8325
## grade 1.6418
## log2(tumor_size + 1) 1.2157
## log2(regional_node_examined + 1) 0.8974
## a_stage_regional 1.0513
## Married 1.0225
## log2(regional_node_examined + 1):Married 1.3874
##
## Concordance= 0.673 (se = 0.012 )
## Likelihood ratio test= 247.5 on 13 df, p=<2e-16
## Wald test = 252.5 on 13 df, p=<2e-16
## Score (logrank) test = 277.1 on 13 df, p=<2e-16
Noticing married seems insignificant in every variable, we tried to add interaction term with married and all the remaining variables, but the interaction term are all insignificant as well as the main marriage effect except for the interaction between married and log2(regional_node_examined) make the married main effect being statisticlally significant under 0.05 level
cox_model5 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade +
log2(tumor_size+1) + log2(regional_node_examined+1)+ a_stage_regional,
data = final)
summary(cox_model5)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race +
## x6th_stage + grade + log2(tumor_size + 1) + log2(regional_node_examined +
## 1) + a_stage_regional, data = final)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.018899 1.019079 0.004591 4.117 3.84e-05
## raceOther -0.529895 0.588667 0.211059 -2.511 0.01205
## raceWhite -0.379032 0.684524 0.126026 -3.008 0.00263
## x6th_stageIIB 0.352933 1.423236 0.146071 2.416 0.01568
## x6th_stageIIIA 0.684198 1.982182 0.149092 4.589 4.45e-06
## x6th_stageIIIB 1.137605 3.119289 0.273191 4.164 3.13e-05
## x6th_stageIIIC 1.439728 4.219548 0.161704 8.903 < 2e-16
## grade 0.367059 1.443483 0.066129 5.551 2.85e-08
## log2(tumor_size + 1) 0.098186 1.103168 0.052181 1.882 0.05989
## log2(regional_node_examined + 1) -0.167119 0.846099 0.051162 -3.266 0.00109
## a_stage_regional -0.347075 0.706752 0.194455 -1.785 0.07428
##
## age ***
## raceOther *
## raceWhite **
## x6th_stageIIB *
## x6th_stageIIIA ***
## x6th_stageIIIB ***
## x6th_stageIIIC ***
## grade ***
## log2(tumor_size + 1) .
## log2(regional_node_examined + 1) **
## a_stage_regional .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0191 0.9813 1.0100 1.0283
## raceOther 0.5887 1.6988 0.3892 0.8903
## raceWhite 0.6845 1.4609 0.5347 0.8763
## x6th_stageIIB 1.4232 0.7026 1.0689 1.8950
## x6th_stageIIIA 1.9822 0.5045 1.4799 2.6549
## x6th_stageIIIB 3.1193 0.3206 1.8261 5.3284
## x6th_stageIIIC 4.2195 0.2370 3.0734 5.7931
## grade 1.4435 0.6928 1.2680 1.6432
## log2(tumor_size + 1) 1.1032 0.9065 0.9959 1.2220
## log2(regional_node_examined + 1) 0.8461 1.1819 0.7654 0.9353
## a_stage_regional 0.7068 1.4149 0.4828 1.0346
##
## Concordance= 0.67 (se = 0.012 )
## Likelihood ratio test= 242.7 on 11 df, p=<2e-16
## Wald test = 248.1 on 11 df, p=<2e-16
## Score (logrank) test = 272.3 on 11 df, p=<2e-16
Since the interaction term gives no significant significant value, we directly drop the marriage variable
cox_model6 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade +
log2(regional_node_examined+1),
data = final)
summary(cox_model6)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race +
## x6th_stage + grade + log2(regional_node_examined + 1), data = final)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.017705 1.017863 0.004567 3.877 0.000106
## raceOther -0.513619 0.598326 0.210909 -2.435 0.014881
## raceWhite -0.367476 0.692480 0.125910 -2.919 0.003517
## x6th_stageIIB 0.461184 1.585950 0.134452 3.430 0.000603
## x6th_stageIIIA 0.821688 2.274336 0.129512 6.345 2.23e-10
## x6th_stageIIIB 1.413804 4.111567 0.247599 5.710 1.13e-08
## x6th_stageIIIC 1.622178 5.064110 0.140484 11.547 < 2e-16
## grade 0.374943 1.454908 0.066050 5.677 1.37e-08
## log2(regional_node_examined + 1) -0.173489 0.840726 0.051254 -3.385 0.000712
##
## age ***
## raceOther *
## raceWhite **
## x6th_stageIIB ***
## x6th_stageIIIA ***
## x6th_stageIIIB ***
## x6th_stageIIIC ***
## grade ***
## log2(regional_node_examined + 1) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0179 0.9825 1.0088 1.0270
## raceOther 0.5983 1.6713 0.3957 0.9046
## raceWhite 0.6925 1.4441 0.5410 0.8863
## x6th_stageIIB 1.5859 0.6305 1.2186 2.0641
## x6th_stageIIIA 2.2743 0.4397 1.7645 2.9315
## x6th_stageIIIB 4.1116 0.2432 2.5308 6.6798
## x6th_stageIIIC 5.0641 0.1975 3.8452 6.6693
## grade 1.4549 0.6873 1.2782 1.6560
## log2(regional_node_examined + 1) 0.8407 1.1894 0.7604 0.9296
##
## Concordance= 0.668 (se = 0.012 )
## Likelihood ratio test= 236.2 on 9 df, p=<2e-16
## Wald test = 238.6 on 9 df, p=<2e-16
## Score (logrank) test = 261 on 9 df, p=<2e-16
further drop variables tumor_size and a_stage_regional due to quite low significant value.
Model Evaluation
summary(final$survival_months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 43.00 62.00 61.24 82.00 107.00
library(timeROC)
cox_models <- list(cox_model1, cox_model2, cox_model3, cox_model4, cox_model5, cox_model6)
time_points <- c(2,43,62,82,107)
dynamic_auc_results <- list()
for (i in 1:length(cox_models)) {
current_model <- cox_models[[i]]
time_roc <- timeROC(T = final$survival_months,
delta = final$status,
marker = predict(current_model, type = "lp"),
cause = 1, # Event of interest
times = time_points, # Time points
iid = TRUE)
dynamic_auc_results[[paste0("Model_", i)]] <- time_roc$AUC
print(paste("AUC for Model", i, "at time points:", paste(time_points, collapse = ", ")))
print(round(time_roc$AUC, 3))
}
## [1] "AUC for Model 1 at time points: 2, 43, 62, 82, 107"
## t=2 t=43 t=62 t=82 t=107
## NA 0.746 0.721 0.724 NA
## [1] "AUC for Model 2 at time points: 2, 43, 62, 82, 107"
## t=2 t=43 t=62 t=82 t=107
## NA 0.702 0.698 0.721 NA
## [1] "AUC for Model 3 at time points: 2, 43, 62, 82, 107"
## t=2 t=43 t=62 t=82 t=107
## NA 0.703 0.701 0.723 NA
## [1] "AUC for Model 4 at time points: 2, 43, 62, 82, 107"
## t=2 t=43 t=62 t=82 t=107
## NA 0.705 0.703 0.724 NA
## [1] "AUC for Model 5 at time points: 2, 43, 62, 82, 107"
## t=2 t=43 t=62 t=82 t=107
## NA 0.700 0.700 0.722 NA
## [1] "AUC for Model 6 at time points: 2, 43, 62, 82, 107"
## t=2 t=43 t=62 t=82 t=107
## NA 0.696 0.696 0.721 NA
model 4 tends to have a relatively higher AUC throughout different time points
C-index
cox_model1$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.518110e+05 1.553440e+05 0.000000e+00 2.219000e+03 0.000000e+00 6.936952e-01
## std
## 1.132421e-02
cox_model2$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.399160e+05 1.672380e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.702418e-01
## std
## 1.160463e-02
cox_model3$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.409430e+05 1.662110e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.722669e-01
## std
## 1.158216e-02
cox_model4$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.414850e+05 1.656690e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.733356e-01
## std
## 1.160303e-02
cox_model5$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.398910e+05 1.672620e+05 2.000000e+00 2.219000e+03 0.000000e+00 6.701935e-01
## std
## 1.163085e-02
cox_model6$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.387630e+05 1.683460e+05 4.600000e+01 2.219000e+03 0.000000e+00 6.680127e-01
## std
## 1.158565e-02
Concordance: model1 69.4 model2 67.0 model3 67.2 model4 69.6 (overall account for meeting the model assumption and the multicollinearity, model 4 have the highest concordance score, although accompanied with “married” variable that is not statistically significant) model5 67.0 model6 66.8
dev_residuals <- residuals(cox_model4, type = "deviance")
plot(dev_residuals, main = "Deviance Residuals", ylab = "Residuals", xlab = "Index")
abline(h = c(-2, 2), col = "red", lty = 2)
surv_fit <- survfit(Surv(survival_months, status) ~ 1, data = final)
plot(surv_fit, xlab = "Time (months)", ylab = "Survival Probability",
main = "Survival Curve for the Final Model", col = "blue", lwd = 2)
grid()